---
title: "Van Egeren et al (2018) fitness distribution similarity hypothesis testing"
output: html_notebook
---

This script was used to calculate the z-scores and p-values for the goodness-of-fit Z test used to assess similarity between the simulated fitness distributions and the experimentally measured distributions. See Fig. 2d and Methods: Comparison of simulated and experimentally-measured fitness distributions for more information.

```{r include=FALSE}
library(readr)
library(ggplot2)
library(reshape2)
```

```{r include=FALSE}
filepath = "./"
var = 4
num_bootstrap = 1000

all_dat_lgnorm = c()
for (j in 1:num_bootstrap){
    filename = paste(filepath, "lognorm_ss_dist/10/", var, "/fit_sim_",j,"type_0.oevo", sep="")
    temp = read_csv(filename, skip=1, col_names=FALSE)
    est_mean = mean(unlist(t(temp[1,2:101])))
    to_plot = unlist(t(temp[1,2:101]))/est_mean
    all_dat_lgnorm = c(all_dat_lgnorm, to_plot)
}

all_dat_twopt = c()
for (j in 1:num_bootstrap){
    filename = paste(filepath, "data_ss_twopt1/", var, "/fit_sim_",j,"type_0.oevo", sep="")
    temp = read_csv(filename, skip=1, col_names=FALSE)
    est_mean = mean(unlist(t(temp[1,2:101])))
    to_plot = unlist(t(temp[1,2:101]))/est_mean
    all_dat_twopt = c(all_dat_twopt, to_plot)
}

all_dat_gamma = c()
for (j in 1:num_bootstrap){
    filename = paste(filepath, "data_gam_exp_regress_dist/gamma/heritable/fit_sim_",j,"type_0.oevo", sep="")
    temp = read_csv(filename, skip=1, col_names=FALSE)
    est_mean = mean(unlist(t(temp[1,2:101])))
    to_plot = unlist(t(temp[1,2:101]))/est_mean
    all_dat_gamma = c(all_dat_gamma, to_plot)
}

all_dat_expo = c()
for (j in 1:num_bootstrap){
    filename = paste(filepath, "data_gam_exp_regress_dist/expo/heritable/fit_sim_",j,"type_0.oevo", sep="")
    temp = read_csv(filename, skip=1, col_names=FALSE)
    est_mean = mean(unlist(t(temp[1,2:101])))
    to_plot = unlist(t(temp[1,2:101]))/est_mean
    all_dat_expo = c(all_dat_expo, to_plot)
}
```

```{r include=FALSE}
filename = "./dists/cd8_norm.txt"
temp = read_tsv(filename, skip=1, col_names=FALSE)
cd8_dist = melt(temp)$value
filename = "./l1210_norm.txt"
temp = read_tsv(filename, skip=1, col_names=FALSE)
l1210_dist = melt(temp)$value
```

```{r include=FALSE}
gauss_kernel <- function(x){
  return(exp(-0.5*x^2)/sqrt(2*pi))
}

calculate_test_stat = function(theory_data, exp_data){
  theory_mean = mean(theory_data)
  theory_var = var(theory_data)
  exp_var = var(exp_data)
  exp_rescaled = (exp_data - mean(exp_data)) * sqrt(theory_var)/sqrt(exp_var)
  exp_rescaled = exp_rescaled + theory_mean
  
  kernel_bandwidth = 1.06 * ((length(exp_rescaled)-1) ^ (-1/5)) *  sqrt(theory_var)
  theoretical_bandwidth = 1.06 * ((length(theory_data)) ^ (-1/5)) *  sqrt(theory_var)
  
  f_squared = 0
  f_g = 0
  for (i in 1:length(exp_rescaled)){
    jackknife = exp_rescaled[-i]
    density_estimator = density(x=jackknife, bw=kernel_bandwidth, n=1, from=exp_rescaled[i], to=exp_rescaled[i])
    f_squared = f_squared + density_estimator$y
    theory_density = density(x=theory_data, bw=theoretical_bandwidth, n=1, from=exp_rescaled[i], to=exp_rescaled[i])
    f_g = f_g + theory_density$y
  }
  f_squared = f_squared/length(exp_rescaled)
  f_g = 2*f_g/length(exp_rescaled)

  to_integrate = function(x){
    theory_density = density(x=theory_data, bw=theoretical_bandwidth, n=1, from=x, to=x)
    return(theory_density$y[1]^2)
  }
  
  #theory_density = density(x=theory_data, bw=theoretical_bandwidth, n=10000)
  #step_size = theory_density$x[2] - theory_density$x[1]
  #g_squared = trap_sum(step_size, (theory_density$y)^2)
  #g_squared = g_squared[length(g_squared)]
  g_squared = integrate(Vectorize(to_integrate), lower = 0.5, upper = 2)$value
  MSE = f_squared + g_squared - f_g
  print(MSE)
  
  std_dev = 0
  for (i in 1:length(exp_rescaled)){
    for (j in 1:length(exp_rescaled)){
      std_dev = std_dev + gauss_kernel((exp_rescaled[i] - exp_rescaled[j])/kernel_bandwidth)^2
    }
  }
  return(MSE/sqrt(std_dev))
}
```

```{r include=FALSE}
lgnorm_z_score_cd8 = calculate_test_stat(all_dat_lgnorm, cd8_dist)
lgnorm_pval_cd8 = 2*pnorm(q=abs(lgnorm_z_score_cd8), lower.tail=FALSE)

lgnorm_z_score_l1210 = calculate_test_stat(all_dat_lgnorm, l1210_dist)
lgnorm_pval_l1210 = 2*pnorm(q=abs(lgnorm_z_score_l1210), lower.tail=FALSE)
```

The null hypothesis of these tests is that the appropriately scaled and shifted simulated fitness distribution (lognormal stochastic fitness alterations) is the same as the experimentally-measured fitness distribution. For the CD8+ fitness distribution, the p-value is `r lgnorm_pval_cd8`, with a z-score of `r lgnorm_z_score_cd8`. For the L1210 fitness distribution, the p-value is `r lgnorm_pval_l1210`, with a z-score of `r lgnorm_z_score_l1210`.